# coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re
import six
import tensorflow as tf



class BertModel(object):
    """
    BERT model("Bidirectional Embedding Representations from a Transformer")
    """
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=True,
                 scope=None):
        """Constructor for BertModel.

            Args:
              config: `BertConfig` instance.
              is_training: bool. True for training model, False for eval model. Controls
                whether dropout will be applied.
              input_ids: int32 Tensor of shape [batch_size, seq_length].
              input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
              token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
              use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
                embeddings or tf.nn.embedding_lookup() for the word embeddings. On the
                TPU, it is much faster if this is True; on the CPU or GPU, it is faster
                if this is False.
              scope: (optional) variable scope. Defaults to "bert".

            Raises:
              ValueError: The config is invalid or one of the input tensor shapes
                is invalid.
            """
        config = copy.deepcopy(config)

        # When not training, disable dropout entirely.
        if not is_training:
            config.hidden_drop_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

        with tf.variable_scope("bert", scope):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids:
                (self.embedding_output, self.embedding_table) = embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name="word_embedding",
                    use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer normalize and perform dropout
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_drop_prob
                )
            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = create_attention_mask_from_input_mask(input_ids, input_mask)

                # Run the stacked transformer.
                # 'sequence_output' shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_drop_prob,
                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)
            self.sequence_output = self.all_encoder_layers[-1]

            # The "pooler" converts the encoded sequence tensor of shapes
            # [batch_size, seq_length, hidden_size] to a tensor of shapes
            # [batch_size, hidden_size]. This is necessary for segment_level
            # (or segment_pair_level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre_trained
                # 这里0:1是将sequence_output的第二个维度变成1,然后tf.squeeze是将张量中的维度为1的去掉,而axis制定了在那个位置的1维度去掉,这里axis1代表第二个位置的1去掉
                first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=create_initializer(config.initializer_range)
                )
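
# A minimal usage sketch (hypothetical values; assumes a `BertConfig` class that
# exposes the attributes referenced in the constructor above):
#
#   config = BertConfig(vocab_size=32000, hidden_size=768, num_hidden_layers=12,
#                       num_attention_heads=12, intermediate_size=3072)
#   input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])  # [batch_size=2, seq_length=3]
#   model = BertModel(config=config, is_training=True, input_ids=input_ids)
#   sequence_output = model.sequence_output  # [2, 3, 768]
#   pooled_output = model.pooled_output      # [2, 768]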


# Helper functions
def gelu(input_tensor):
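    """Gaussian Error Linear Unit (GELU): gelu(x) = x * P(X <= x), X ~ N(0, 1)."""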
    cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
    return input_tensor * cdf
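
# Note: this is the exact erf-based GELU. Some implementations instead use the
# faster tanh approximation 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)));
# the two forms are nearly identical numerically.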

def get_activation(activation_string):
    """
    Maps a string to a Python function, e.g.. "relu"=>'tf.nn.relu'
    :param activation_string: String name of the activation function.
    :return:
        Apython function corresponding to the activation function. If
        'actiontion_string' is None, empty, or "linear", this will return
        None. If 'activation_string' is not a string, it will return '
        activation_string.
    :Raises:
        ValueError: The 'activation_string' does not correspond to a known
                    activation.
    """
    if not isinstance(activation_string, six.string_types):
        return activation_string

    if not activation_string:
        return None

    act = activation_string.lower()
    if act == "linear":
        return None
    elif act == "relu":
        return tf.nn.relu
    elif act == "gelu":
        return gelu
    elif act == "tanh":
        return tf.nn.tanh
    else:
        raise ValueError("Unsupported activation: %s" % act)


def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """
    Performs multi_headed attention from 'from tensor' to 'to_tensor'

    This is an implementation of multi_headed attention based on "Attention is all you Need".
    If 'from_tensor' and 'to_tensor' are the same, then this is a self-attention. Each timestep
    in 'from_tensor' attends to the corresponding sequence in 'to_tensor', and returns a fixed-width
    vector.

    This function first projects 'from_tensor' into a "query" tensor and "to_tensor" into "key" and "
    value" tensors. These are (effectively) a list of tensors of length 'num_attention_heads', where
    each tensor is of shape [batch_size, seq_length, size_per_head].

    Then, the query and key tensor are dot_producted and scaled. These are softmaxed to obtain attention
    probabilities. The value tensors are then interpolated by these probabilities, then concatenated back
    to a single tensor and returned.

    In practice, the multi-headed attention are done with transposes and reshapes rather than actual
    separate tensors.

    :param from_tensor:     float Tensor of shape [batch_size, from_seq_length, from_width]
    :param to_tensor:       float Tensor of shape [batch_size, to_seq_length, to_width]
    :param attention_mask:  (optional) int32 Tensor of shape [batch_size, from_seq_length, to_seq_length].
                            The values should be 1 or 0. The attention scores will effectively be set to
                            -infinity for any positions in the mask that are 0, and will be unchanged for
                            positions that are 1.
    :param num_attention_heads:     int. Number of attention heads.
    :param size_per_head:       int. Size of each attention head.
    :param query_act:       (optional) Activation function for the query transform.
    :param key_act:     (optional) Activation function for the key transform.
    :param value_act:       (optional) Activation function for the value transform.
    :param attention_probs_dropout_prob:    (optional) float. Dropout probability of the attention probabilities.
    :param initializer_range:       float. Range of the weight initializer.
    :param do_return_2d_tensor:     bool. If True, the output will be of shape
                                    [batch_size * from_seq_length, num_attention_heads * size_per_head].
                                    If False, the output will be of shape
                                    [batch_size, from_seq_length, num_attention_heads * size_per_head].
    :param batch_size:      (optional) int. If the input is 2D, this might be the batch size of 3D version
                                            of the 'from_tensor' and 'to_tensor'.
    :param from_seq_length:     (optional) If the input is 2D, this might be the seq length
                                            of the 3D version of the 'from_tensor'.
    :param to_seq_length:       (optional) If the input is 2D, this might be the seq length
                                            of the 3D version of the 'to_tensor'.
    :return:
        float Tensor of shape [batch_size, from_seq_length, num_attention_heads * size_per_head].
        (If 'do_return_2d_tensor' is true, this will be of shape
        [batch_size * from_seq_length, num_attention_heads * size_per_head]).
    :raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(input_tensor, [batch_size, seq_length, num_attention_heads, width])

        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2,3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2,3])

    if len(from_shape) != len(to_shape):
        raise ValueError("The rank of \'from_tensor\' must match the ranke of \'to_tensor\'.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for 'batch_size', 'from_seq_length', and 'to_seq_length' "
                "must all be specified."
            )

    # Scalar dimensions referenced here:
    # B = batch_size (number of sequences)
    # F = 'from_tensor' sequence_length
    # T = 'to_tensor' sequence_length
    # N = 'num_attention_heads'
    # H = 'size_per_head'

    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # 'query_layer' = [B*F, N*H]
    query_layer = tf.layers.dense(from_tensor_2d,
                                 num_attention_heads*size_per_head,
                                 activation=query_act,
                                 name='query',
                                 kernel_initializer=create_initializer(initializer_range))

    # 'key_layer' = [B*T, N*H]
    key_layer = tf.layers.dense(to_tensor_2d,
                                num_attention_heads*size_per_head,
                                activation=key_act,
                                name="key",
                                kernel_initializer=create_initializer(initializer_range))

    # 'value_layer' = [B*T, N*H]
    value_layer = tf.layers.dense(to_tensor_2d,
                                  num_attention_heads*size_per_head,
                                  activation=value_act,
                                  name="value",
                                  kernel_initializer=create_initializer(initializer_range))

    # 'query_layer' = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size, num_attention_heads, from_seq_length,size_per_head)

    # 'key_layer' = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, to_seq_length, size_per_head)

    # Take the dot product between "query" and "key" to get the raw attention scores,
    # 'attention_scores' = [B,N,F,T]
    # Attention(Q, K, V) = softmax(Q * K^T / sqrt(d_k)) * V
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores, 1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # 'attention_mask' = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.  attention_probs = [B,N,F,T]
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # value_layer = [B,T,N,H]
    value_layer = tf.reshape(value_layer, [batch_size, to_seq_length, num_attention_heads, size_per_head])
    # value_layer = [B,N,T,H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # context_layer = [B,N,F,H]
    context_layer = tf.matmul(attention_probs, value_layer)

    # context_layer = [B,F,N,H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # context_layer = [B*F, N*H]
        context_layer = tf.reshape(context_layer, [batch_size * from_seq_length, num_attention_heads * size_per_head])
    else:
        # context_layer = [B, F, N*H]
        context_layer = tf.reshape(context_layer, [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
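
# A minimal self-attention sketch (hypothetical shapes): with batch_size B=2,
# from_seq_length F = to_seq_length T = 4, num_attention_heads N=3 and
# size_per_head H=5, a [2, 4, 15] input attends to itself:
#
#   x = tf.random_normal([2, 4, 15])
#   context = attention_layer(from_tensor=x, to_tensor=x,
#                             num_attention_heads=3, size_per_head=5)
#   # context.shape == [2, 4, 3 * 5]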


def reshape_from_matrix(output_tensor, orig_shape_list):
    """
    Reshapes a rank 2 tensor back to its original rank >= 2 shape.
    :param output_tensor: rank 2 tensor, e.g. of shape [6, 4]
    :param orig_shape_list: the original shape, e.g. [2, 3, 4]
    :return: tensor reshaped to the original rank, e.g. of shape [2, 3, 4]
    """
    if len(orig_shape_list) == 2:
        return output_tensor

    output_shape = get_shape_list(output_tensor)  # e.g. [6, 4]

    orig_dims = orig_shape_list[0:-1]  # e.g. [2, 3, 4] -> [2, 3]
    width = output_shape[-1]  # e.g. 4

    # e.g. restores the original 3D shape [2, 3, 4] from the 2D matrix
    return tf.reshape(output_tensor, orig_dims + [width])

def reshape_to_matrix(input_tensor):
    """
    Reshapes a rank >= 2 tensor to a rank 2 tensor (i.e., a matrix).

    For example, for an input of shape [2, 3, 4]: width = 4,
    output_tensor = tf.reshape(input_tensor, [-1, 4]), giving shape [6, 4].
    :param input_tensor: float Tensor of rank >= 2
    :return: rank 2 tensor of shape [-1, width]
    """
    ndims = input_tensor.shape.ndims
    if ndims < 2:
        raise ValueError("input tensor must have at least rank 2. Shape = %s" %
                         (input_tensor.shape))

    if ndims == 2:
        return input_tensor

    width = input_tensor.shape[-1]
    output_tensor = tf.reshape(input_tensor, [-1, width])
    return output_tensor


def create_attention_mask_from_input_mask(from_tensor, to_mask):
    """
    Create 3D attention mask from a 2D tensor mask
    :param from_tensor: 2D or 3D Tensor of shap [batch_size, from_seq_length,...]
    :param to_mask: int32 Tensor of shape [batch_size, to_seq_length+]
    :return:
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2,3])
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]

    to_shape = get_shape_list(to_mask, expected_rank=2)
    to_seq_length = to_shape[1]

    to_mask = tf.cast(
        tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32
    )
    # We don't assume that 'from_tensor' is a mask (although it could be). We
    # don't actually care if we attend *from* padding tokens (only *to* padding
    # tokens) so we create a tensor of all ones.

    # 'broadcast_ones' = [batch_size, from_seq_length, 1]
    broadcast_ones = tf.ones(
        shape=[batch_size, from_seq_length, 1], dtype=tf.float32
    )

    # Here we broadcast along two dimensions to create the mask.
    mask = broadcast_ones * to_mask  # [batch_size, from_seq_length, to_seq_length]

    return mask
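
# A worked example (hypothetical values): for to_mask = [[1, 1, 0]] with
# batch_size=1, to_seq_length=3 and from_seq_length=2, the reshaped mask
# [[[1., 1., 0.]]] broadcasts against ones of shape [1, 2, 1], giving
#   [[[1., 1., 0.],
#     [1., 1., 0.]]]
# i.e. every query position may attend to tokens 0 and 1 but not to the
# padding token at position 2.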

def dropout(input_tensor, dropout_prob):
    """
    Perform dropout
    :param input_tensor:  float Tensor.
    :param dropout_prob: pythonfloat. The probability of dropping out a value(Not of *keepling* a dimension as in "tf.nn.dropout')
    :return: a version of input_tensor with ddropout applied
    """
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor

    output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
    return output
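
# e.g. dropout(x, 0.1) keeps each element with probability 0.9 (tf.nn.dropout's
# keep_prob = 1.0 - dropout_prob) and rescales the kept values by 1 / 0.9.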


def layer_norm(input_tensor, name=None):
    return tf.contrib.layers.layer_norm(
        inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name
    )

def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
    """
    Run layer normalization followed by dropout
    :param input_tensor:
    :param dropout_prob:
    :param name:
    :return:
    """
    output_tensor = layer_norm(input_tensor, name)
    output_tensor = dropout(output_tensor, dropout_prob)
    return output_tensor

def create_initializer(initializer_range=0.02):
    """
    Creates a 'truncated normal initializer' with the given range.
    :param initializer_range:
    :return:
    """
    return tf.truncated_normal_initializer(stddev=initializer_range)

def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """
    Performs various post-processing on a word embedding tensor.

    :param input_tensor: float Tensor of shape[batch_size, seq_length, embedding_size]
    :param use_token_type:  bool. Whether to add embeddings for "token_type_ids".
    :param token_type_ids:  (optional) int32 Tensor of shape[batch_size, seq_length]
    :param token_type_vocab_size:   int. The vocabulary size of 'token_type_ids'
    :param token_type_embedding_name:   string. The name of the embedding table variable for token type ids.
    :param use_position_embeddings: bool. Whether to add position embeddings for the position of each token in the sequence.
    :param position_embedding_name: string. The name of the embedding table variable for positional embeddings.
    :param initializer_range:   float. Range of the weight initialization.
    :param max_position_embeddings: int. Maximum sequence length that might ever be used with this model. This can be longer than the sequence length of input_tensor but cannot be shorter.
    :param dropout_prob:    float. Dropout probability applied to the final output tensor.
    :return:
        float tensor with same shape as 'input_tensor'
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)  # [batch_size, seq_length, embedding_size]
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]  # embedding_size

    if seq_length > max_position_embeddings:
        raise ValueError("The seq length (%d) cannot be greater than "
                         "'max_position_embeddings' (%d)" %
                         (seq_length, max_position_embeddings))
    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("token _tpye_ids must be specified if"
                             "use_token_type is True")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range)
        )
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary.
        # Flatten token_type_ids from [batch_size, seq_length] to [batch_size * seq_length].
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings, [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, width],
            initializer=create_initializer(initializer_range)
        )
        if seq_length < max_position_embeddings:
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
        else:
            position_embeddings = full_position_embeddings#[seq_length, width]

        num_dims = len(output.shape.as_list())
        # Only the last two dimensions are relevant('seq_length' and 'width'), so
        # We broadcast among the first dimensions, which is typically just
        # the batch_size
        position_broadcast_shape = []
        for _ in range(num_dims-2):
            position_broadcast_shape.append(1)
        position_broadcast_shape.extend([seq_length, width])  # [1, seq_length, width]
        position_embeddings = tf.reshape(position_embeddings, position_broadcast_shape)
        output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
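
# A minimal sketch (hypothetical shapes; `word_embeddings` stands for the output
# of embedding_lookup below): for input of shape [batch_size=2, seq_length=128,
# width=768] with two segment types:
#
#   token_type_ids = tf.zeros([2, 128], dtype=tf.int32)
#   out = embedding_postprocessor(input_tensor=word_embeddings,
#                                 use_token_type=True,
#                                 token_type_ids=token_type_ids,
#                                 token_type_vocab_size=2,
#                                 max_position_embeddings=512)
#   # out.shape == [2, 128, 768]: word + segment + position embeddings,
#   # layer-normalized, with dropout applied.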

def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddinds",
                     use_one_hot_embeddings=False):
    """
    Looks up word embeddings for an id tensor.
    :param input_ids:   int32 Tensor of shape [batch_size, seq_length] containing word ids.
    :param vocab_size:  int. Size of the embedding vocabulary.
    :param embedding_size:  int. Width of the word embedding.
    :param initializer_range:   float. Embedding initialization range.
    :param word_embedding_name: string. Name of the embedding table.
    :param use_one_hot_embeddings:  bool. If True, use the one-hot method for word embeddings. If False, use tf.nn.embedding_lookup(). One-hot is better for TPUs.
    :return: float Tensor of shape [batch_size, seq_length, embedding_size]
    """
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range)
    )

    if use_one_hot_embeddings:
        flat_input_ids = tf.reshape(input_ids, [-1])
        one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
        output = tf.matmul(one_hot_input_ids, embedding_table)
    else:
        output = tf.nn.embedding_lookup(embedding_table, input_ids)

    input_shape = get_shape_list(input_ids)
    output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return (output, embedding_table)
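
# The two lookup paths are equivalent (hypothetical toy values): for
# embedding_table = [[0., 1.], [2., 3.], [4., 5.]] and flat ids [2, 0],
# tf.matmul(tf.one_hot([2, 0], 3), embedding_table) gives [[4., 5.], [0., 1.]],
# the same rows tf.nn.embedding_lookup would gather; the matmul form just
# avoids the gather op, which is slow on TPUs.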

def assert_rank(tensor, expected_rank, name):
    """
    Raises an exception if the tensor rank is not of the expected rank
    :param tensor: A tf.tensor to check the rank of.
    :param expected_rank:   Python integer or list of integers, expected rank.
    :param name: Optional name of the tensor for the error message.

    :Raises:
        ValueError: If the expected shape doesn't match the actual shape.
    """
    if name is None:
        name = tensor.name

    expected_rank_dict = {}
    if isinstance(expected_rank, six.integer_types):
        expected_rank_dict[expected_rank] = True
    else:
        for x in expected_rank:
            expected_rank_dict[x] = True

    # The actual rank (number of dimensions) of the tensor.
    actual_rank = tensor.shape.ndims
    if actual_rank not in expected_rank_dict:
        scope_name = tf.get_variable_scope().name
        raise ValueError(
            "For the tensor '%s' in scope '%s', the actual rank "
            "'%d' (shape = %s) is not equal to the expected rank '%s'" %
            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))

def get_shape_list(tensor, expected_rank=None, name=None):
    """
    Returns a list of the shape of tensor preferring static dimensions
    :param tensor: A tf.Tensor object to find shape of
    :param expected_rank:   (Optional) int. The expected rank of 'tensor', If this is specified and the 'tensor' has a different rank, and exception will be thrown.
    :param name:    Optional name of the tensor for the error message.
    :return:
        A list of dimensions of the shape of tensor. All static dimensions will be returned
        as python integers, and dynamic dimensions will be returned as tf.Tensor scalars.
    """
    if name is None:
        name = tensor.name
    # Verify that the tensor's rank matches 'expected_rank' (raises ValueError if not).
    if expected_rank is not None:
        assert_rank(tensor, expected_rank, name)

    shape = tensor.shape.as_list()

    non_static_indexes = []
    for (index, dim) in enumerate(shape):
        if dim is None:
            non_static_indexes.append(index)

    if not non_static_indexes:
        return shape

    dyn_shape = tf.shape(tensor)
    for index in non_static_indexes:
        shape[index] = dyn_shape[index]
    return shape
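
# A minimal sketch (hypothetical placeholder): for a tensor with static shape
# [None, 128, 768], the unknown batch dimension comes back as a tf.shape()
# scalar while the static dimensions come back as plain Python ints:
#
#   x = tf.placeholder(tf.float32, shape=[None, 128, 768])
#   dims = get_shape_list(x, expected_rank=3)
#   # dims == [<dynamic batch-size tensor>, 128, 768]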

def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """
    Multi-headed, multi-layer Transformer from "Attention is All You Need"
    This is almost an exact implementation of the original Transformer encoder.

    :param input_tensor:    float Tensor of shape [batch_size, seq_length, hidden_size]
    :param attention_mask:  (optional) int32 tensor of shape [batch_size, seq_length, seq_length]
                            with 1 for positions that can be attended to and 0 in positions that should not be.
    :param hidden_size:     int Hidden size of the Transformer.
    :param num_hidden_layers:       int. Number of layers (blocks) in the Transformer.
    :param num_attention_heads:    int. Number of attention heads in the transformer.
    :param intermediate_size:       int. The size of the "intermediate" (a.k.a., feed forward) layer.
    :param intermediate_act_fn:     function. The non_linear activation function to apply to the output of the intermediate/feed-forward layer.
    :param hidden_dropout_prob:     float. Dropout probability for the hidden layers.
    :param attention_probs_dropout_prob:    float. Dropout probability of the attention probabilities.
    :param initializer_range:       float. Range of the initializer(stddev of truncated normal)
    :param do_return_all_layers:    Whether to also return all layers or just the final layer.
    :return:
        float Tensor of shape [batch_size, seq_length, hidden_size], the final hidden layer of the Transformer.

    :Raises:
        ValueError: A tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) id not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads)
        )

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs to be the same as the hidden size
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hiddensize (%d)" %
                         (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid reshaping it back and forth
    # from a 3D tensor to a 2D tensor. Reshapes are normally free on the GPU/CPU but
    # may not be free on the TPU, so we want to minimize them to help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_ %d" % layer_idx):
            layer_input = prev_output

            with tf.variable_scope("attention"):
                attention_heads = []
                with tf.variable_scope("self"):
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length
                    )
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of 'hidden_size' then add a residual
                # with 'layer_input'.
                with tf.variable_scope("output"):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range)
                    )
                    attention_output = dropout(attention_output, hidden_dropout_prob)
                    # residual connection
                    attention_output = layer_norm(attention_output + layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            # The feed-forward layer after attention typically uses 4H hidden units;
            # e.g. hidden_size = 768 gives intermediate_size = 4 * 768 = 3072.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range)
                )

            # Down_project back to 'hidden_size' then add the residual.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range)
                )
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
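
# A minimal usage sketch (hypothetical shapes): a 4-layer encoder over a
# [batch_size=2, seq_length=8, hidden_size=768] input returns a tensor of the
# same shape (or a list of per-layer tensors when do_return_all_layers=True):
#
#   x = tf.random_normal([2, 8, 768])
#   out = transformer_model(x, num_hidden_layers=4)
#   # out.shape == [2, 8, 768]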